1
2
3
4 package uk.ac.roe.antigen.utils;
5
6 import java.io.IOException;
7 import java.io.Reader;
8 import java.io.StringReader;
9 import java.util.HashMap;
10 import java.util.Map;
11
12 import javax.swing.text.MutableAttributeSet;
13 import javax.swing.text.html.HTML;
14 import javax.swing.text.html.HTMLEditorKit;
15 import javax.swing.text.html.parser.ParserDelegator;
16
/**
 * Converts HTML markup into a plain-text approximation.
 *
 * The conversion is driven by the Swing HTML parser: tags are dropped and a
 * handful of them are replaced by textual decoration (underlined headings,
 * starred bold text, underscored emphasis, bracketed link targets, bulleted
 * list items and a horizontal rule made of underscores).
 */
public class HtmlToTextParser {

    /** Swing parser front end used for every conversion. */
    private final ParserDelegator parser = new ParserDelegator();

    /**
     * Parses the HTML supplied by the reader and returns its text rendering.
     *
     * Every call uses a fresh callback, so the output buffer, heading
     * character count and list indentation of one conversion never carry over
     * into the next one.
     *
     * @param input
     *            source of the HTML characters; must not be null
     * @return the plain-text rendering of the document
     * @throws IOException
     *             if reading from the input fails
     * @throws NullPointerException
     *             if input is null
     */
    public String parse(Reader input) throws IOException {
        if (input == null) {
            throw new NullPointerException("input");
        }
        TagRemovalParserCallback callback = new TagRemovalParserCallback();
        // The input is already a stream of decoded characters, so a charset
        // declaration inside the document must be ignored. Passing false here
        // makes the Swing parser abort with a ChangedCharSetException as soon
        // as it meets a <meta http-equiv="Content-Type" ... charset=...> tag.
        parser.parse(input, callback, true);
        return callback.getText();
    }

    /**
     * Simple test: converts a small hard-coded document and prints the result
     * to standard output.
     *
     * @param args
     *            ignored
     * @throws IOException
     *             if the conversion fails
     */
    public static void main(String[] args) throws IOException {
        String htmlText = "<html><head></head><body>" + "<h1>Heading 1</h1>"
                + "<h2>Heading 2</h2>" + "Some <b>bold</b> test and a new<br>"
                + "line in <em>italics</em>" + "<p>A separate paragraph</p>"
                + "separated by a <hr> line, "
                + "a <a href='http://www.astrogrid.org'>link</a>, "
                + "and a <h3>third heading</h3> to finish.";

        Reader input = new StringReader(htmlText);
        HtmlToTextParser parser = new HtmlToTextParser();
        String output = parser.parse(input);
        System.out.println(output);
    }

    /**
     * Parser callback that accumulates the text rendering of one document.
     *
     * Declared static so that it owns all of its state and holds no reference
     * to the enclosing parser instance.
     */
    private static class TagRemovalParserCallback extends HTMLEditorKit.ParserCallback {

        /** Number of underscores used to draw a horizontal rule. */
        private static final int LINELENGTH = 40;

        /** Marker written before and after bold text. */
        private static final char BOLDCHAR = '*';

        /** Marker written before and after emphasised text. */
        private static final char ITALCHAR = '_';

        /** Text produced so far. */
        private final StringBuffer contentBuffer = new StringBuffer();

        /**
         * Number of text characters seen since the last heading start tag;
         * it determines the length of the heading underline.
         */
        private int charCount = 0;

        /** Current nesting depth of unordered lists. */
        private int indentationLevel = 0;

        /**
         * @return the text accumulated so far
         */
        String getText() {
            return contentBuffer.toString();
        }

        /**
         * Looks up the underline character for a heading tag.
         *
         * @param tag
         *            tag to look up
         * @return the underline string for H1, H2 and H3, or null when the
         *         tag is not one of these headings
         */
        private static String underlineFor(HTML.Tag tag) {
            if (tag == HTML.Tag.H1) {
                return "=";
            }
            if (tag == HTML.Tag.H2) {
                return "-";
            }
            if (tag == HTML.Tag.H3) {
                return ".";
            }
            return null;
        }

        /**
         * Handles tags reported without an end tag: a line break or empty
         * paragraph becomes a newline, a horizontal rule becomes a line of
         * LINELENGTH underscores on a line of its own.
         */
        public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attrs,
                int pos) {
            if (tag == HTML.Tag.BR || tag == HTML.Tag.P) {
                contentBuffer.append("\n");
            } else if (tag == HTML.Tag.HR) {
                contentBuffer.append("\n");
                for (int i = 0; i < LINELENGTH; ++i) {
                    contentBuffer.append("_");
                }
                contentBuffer.append("\n");
            }
        }

        /**
         * Writes the opening decoration for the tags this converter knows
         * about; every other tag is dropped silently.
         */
        public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrs,
                int pos) {
            if (tag == HTML.Tag.B) {
                contentBuffer.append(BOLDCHAR);
            } else if (tag == HTML.Tag.EM) {
                contentBuffer.append(ITALCHAR);
            } else if (tag == HTML.Tag.P) {
                contentBuffer.append('\n');
            } else if (underlineFor(tag) != null) {
                contentBuffer.append('\n');
                // Start counting the heading text for the underline.
                charCount = 0;
            } else if (tag == HTML.Tag.A) {
                // Anchors such as <a name="..."> carry no href; writing the
                // missing value would put a literal "[null]" in the output.
                Object href = attrs.getAttribute(HTML.Attribute.HREF);
                if (href != null) {
                    contentBuffer.append('[').append(href).append(']');
                }
            } else if (tag == HTML.Tag.LI) {
                contentBuffer.append("\n");
                for (int i = 0; i < indentationLevel; ++i) {
                    contentBuffer.append(" ");
                }
                contentBuffer.append("o ");
            } else if (tag == HTML.Tag.UL) {
                indentationLevel++;
            }
        }

        /**
         * Writes the closing decoration: the matching bold/emphasis marker,
         * a newline after a paragraph or list, and an underline as long as
         * the counted heading text after a heading.
         */
        public void handleEndTag(HTML.Tag tag, int pos) {
            String underline = underlineFor(tag);
            if (tag == HTML.Tag.B) {
                contentBuffer.append(BOLDCHAR);
            } else if (tag == HTML.Tag.EM) {
                contentBuffer.append(ITALCHAR);
            } else if (tag == HTML.Tag.P) {
                contentBuffer.append('\n');
            } else if (underline != null) {
                contentBuffer.append('\n');
                for (int i = 0; i < charCount; ++i) {
                    contentBuffer.append(underline);
                }
                charCount = 0;
                contentBuffer.append('\n');
            } else if (tag == HTML.Tag.UL) {
                indentationLevel--;
                contentBuffer.append('\n');
            }
        }

        /**
         * Copies document text to the output and adds its length to the
         * running heading character count.
         */
        public void handleText(char[] data, int pos) {
            contentBuffer.append(data);
            charCount += data.length;
        }

    }

}